This Python notebook compares the built-in explainability features of XGBoost with the one provided by LIME. The outline of this notebook is as follows
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import LabelBinarizer, StandardScaler, LabelEncoder
import xgboost as xgb
import lime.lime_tabular
from graphviz import Digraph
In [2]:
np.random.seed(1)
%matplotlib inline
In [3]:
def _parse_node_patched_with_leaf_index(graph, text):
"""parse dumped node"""
match = xgb.plotting._NODEPAT.match(text)
if match is not None:
node = match.group(1)
graph.node(node, label=match.group(2), shape='circle')
return node
match = xgb.plotting._LEAFPAT.match(text)
if match is not None:
node = match.group(1)
graph.node(node, label=match.group(2) +
'\nleaf index: ' + node, shape='box')
return node
raise ValueError('Unable to parse node: {0}'.format(text))
old_parse_node = xgb.plotting._parse_node
xgb.plotting._parse_node = _parse_node_patched_with_leaf_index
The Shapley value is a solution concept in cooperative game theory.
SHAP values are defined as the Shapley values of a conditional expectation function and are introduced in the publication A unified approach to interpreting model predictions.
fever
and cough
.
In [4]:
columns_ex = ['fever', 'cough', 'bias']
def plot_sickness_graph(label, contributions, cough_yes_leaves=[0, 100]):
def format_contributions(contribs):
zipped = zip(columns_ex, contribs[0])
def combine_contrib(tup): return ": ".join(str(i) for i in tup)
return '\n'.join(map(combine_contrib, zipped))
dot = Digraph()
dot.attr('graph', label=label)
dot.attr('graph', labelloc='t')
dot.attr('graph', rank='LR')
dot.attr('node', shape='box')
dot.node('R', 'Cough')
dot.node('F1', 'Fever')
dot.node('F2', 'Fever')
dot.node('L1', '0', shape='plaintext')
dot.node('L2', '0', shape='plaintext')
dot.node('L3', str(cough_yes_leaves[0]), shape='plaintext')
dot.node('L4', str(cough_yes_leaves[1]), shape='plaintext')
dot.edge('R', 'F1', label='No')
dot.edge('R', 'F2', label='Yes')
dot.edge('F1', 'L1', label='No')
dot.edge('F1', 'L2', label='Yes')
dot.edge('F2', 'L3', label='No')
dot.edge('F2', 'L4', label='Yes')
dot.node('Feature contributions\n' +
format_contributions(contributions), shape='plaintext')
return dot
The following two instances of the sickness score example illustrate the contribution values calculated before XGBoost was extended with Tree SHAP, ie. before the pull request #2438 was merged and before commit 78c4188cec31425f708d238160ea3afb67a7250a.
In [5]:
contributions_and = [[39.33431244, 35.66350174, 24.99929047]]
plot_sickness_graph(
'Sickness score example #1 (simple AND)', contributions_and)
Out[5]:
In [6]:
contributions = np.array([[49.9985466, 29.99912071, 29.99913406]])
plot_sickness_graph(
'Sickness score example #2 (with increased impact of "cough")', contributions, [10, 110])
Out[6]:
In [7]:
contribution_shift_pre_shap = pd.DataFrame(
contributions - contributions_and, columns=columns_ex)
contribution_shift_pre_shap
Out[7]:
The next two instances correspond to the previous ones but the contributions are calculated with Tree SHAP
cough
increases in the second instance
In [8]:
data = np.array([[0, 0, 0],
[1, 0, 0],
[0, 1, 0],
[1, 1, 100]]).repeat([100, 100, 100, 100], axis=0)
sickness = pd.DataFrame(data, columns=['fever', 'cough', 'sickness'])
bst_test = xgb.XGBRegressor(objective='reg:linear')
bst_test.fit(sickness.iloc[:, :-1].values, sickness.iloc[:, -1])
dmatrix = xgb.DMatrix(np.array([[1, 1]]))
contributions_and_new = bst_test.get_booster().predict(dmatrix, pred_contribs=True)
plot_sickness_graph('Sickness score example #1 (simple AND)',
contributions_and_new)
Out[8]:
In [9]:
data = np.array([[0, 0, 0],
[1, 0, 0],
[0, 1, 10],
[1, 1, 110]]).repeat([100, 100, 100, 100], axis=0)
sickness = pd.DataFrame(data, columns=['fever', 'cough', 'sickness'])
bst_test.fit(sickness.iloc[:, :-1].values, sickness.iloc[:, -1])
dmatrix = xgb.DMatrix(np.array([[1, 1]]))
contributions_new = bst_test.get_booster().predict(dmatrix, pred_contribs=True)
plot_sickness_graph(
'Sickness score example #2 (with increased impact of "cough")', contributions_new, [10, 110])
Out[9]:
In [10]:
contribution_shift_post_shap = pd.DataFrame(
contributions_new - contributions_and_new, columns=columns_ex)
contribution_shift_post_shap
Out[10]:
The dataset being used is the Adult Census Data Set, which has a significant amount of instances and attributes and contains continuous as well as categorical attributes.
In [11]:
column_names = ["Age", "Workclass", "fnlwgt", "Education", "Education-Num", "Marital Status", "Occupation",
"Relationship", "Race", "Sex", "Capital Gain", "Capital Loss", "Hours per week", "Country", "Label"]
df = pd.read_csv('data/adult.data', names=column_names, index_col=False)
In [12]:
df
Out[12]:
In [13]:
from sklearn_pandas import DataFrameMapper, cross_val_score
label_binarizer = LabelBinarizer()
mapper = DataFrameMapper([
(['Age'], StandardScaler()),
('Workclass', LabelEncoder()),
(['fnlwgt'], StandardScaler()),
('Education', LabelEncoder()),
(['Education-Num'], StandardScaler()),
('Marital Status', LabelEncoder()),
('Occupation', LabelEncoder()),
('Relationship', LabelEncoder()),
('Race', LabelEncoder()),
('Sex', LabelEncoder()),
(['Capital Gain'], StandardScaler()),
(['Capital Loss'], StandardScaler()),
(['Hours per week'], StandardScaler()),
('Country', LabelEncoder()),
('Label', label_binarizer)
],
df_out=True
)
fitted_df = mapper.fit_transform(df)
In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
fitted_df.iloc[:, :-1], fitted_df.iloc[:, -1], test_size=0.20)
bst = xgb.XGBClassifier(n_estimators=300, max_depth=5)
bst.fit(X_train.values, y_train.values)
Out[14]:
In [15]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test.values, bst.predict(X_test.values))
Out[15]:
In [16]:
i = 1412
X_train.iloc[i, :]
Out[16]:
In [17]:
booster = bst.get_booster()
inst = np.array([X_train.iloc[i, :].values])
prediction = bst.predict(inst)
print('predicted class (transformed to 0/1): ' + str(bst.predict(inst)[0]))
print('actual class (transformed to 0/1): ' + str(y_train[i]))
print('precicted class: {} of {}'.format(
str(label_binarizer.classes_[int(prediction)]), label_binarizer.classes_))
In [18]:
dtrain = xgb.DMatrix(np.array([X_train.iloc[i, :].values]))
print('Predicted probability: ' + str(booster.predict(dtrain)))
print('Predicted raw value: {}\n'.format(
booster.predict(dtrain, output_margin=True)))
contribs = booster.predict(dtrain, pred_contribs=True)
contributions2 = zip(column_names[:-1] + ['Bias'], contribs[0])
print('Contributions (sorted by absolute value):\n' + '\n'.join(str(x)
for x in sorted(contributions2, key=lambda x: abs(x[1]), reverse=True)))
print("Sum of contributions {}".format(
booster.predict(dtrain, pred_contribs=True).sum()))
In [19]:
import shap
# load JS visualization code to notebook
shap.initjs()
In [20]:
dmatrix = xgb.DMatrix(X_train.values)
shap_values = bst.get_booster().predict(dmatrix, pred_contribs=True)
# visualize the first prediction's explaination
shap.visualize(
shap_values[i, :], feature_names=df.columns[:-1], data=X_train.iloc[i, :].values)
Out[20]:
In [21]:
categorical_features_idxs = [1, 3, 5, 6, 7, 8, 9, 13]
categorical_names = {}
for feature in categorical_features_idxs:
le = sklearn.preprocessing.LabelEncoder()
le.fit(df.iloc[:, feature])
categorical_names[feature] = le.classes_
explainer = lime.lime_tabular.LimeTabularExplainer(X_train.values, # fitted_df.iloc[:,:-1].values.astype(float),
feature_names=['Age', 'Workclass', 'fnlwgt', 'Education', 'Education-Num',
'Marital Status', 'Occupation', 'Relationship', 'Race', 'Sex',
'Capital gain', 'Capital Loss', 'Hours per week', 'Country'],
categorical_features=categorical_features_idxs,
categorical_names=categorical_names,
class_names=label_binarizer.classes_,
kernel_width=3, verbose=False)
In [22]:
# Custom predict function not necessary, as all date has been transformed already
# predict_fn = lambda x: bst.predict_proba(mapper.fit_transform(x)).astype(float)
exp = explainer.explain_instance(
X_train.iloc[i, :].values, bst.predict_proba, num_features=4)
exp.show_in_notebook(show_all=False)
In [23]:
exp.show_in_notebook(show_all=False)
shap.visualize(
shap_values[i, :], feature_names=df.columns[:-1], data=X_train.iloc[i, :].values)
Out[23]:
In [24]:
legend_text = """Instance: {0}
{1}
Predicted leaf: {2}
""".format(i, X_train.iloc[i, :], booster.predict(dtrain, pred_leaf=True)[0][0])
legend_test = '''
Instance: + str(i)
'''
In [25]:
%config InlineBackend.figure_format = 'retina'
plt.rc('figure', figsize=(50, 50))
fig = plt.figure()
ax = fig.add_subplot(111)
xgb.plot_tree(bst, fmap='data/adult.fmap', num_trees=0, rankdir='LR', ax=ax)
ax.text(0.0, 0.7, legend_text, bbox={
'facecolor': 'gray', 'alpha': 0.5, 'pad': 5}, fontsize=30, transform=ax.transAxes)
plt.show()
In [26]:
plt.rc('figure', figsize=(15, 15))
scores = booster.get_score(fmap='data/adult.fmap', importance_type='gain')
xgb.plot_importance(scores, xlabel='Gain')
Out[26]: